# -*- coding: utf-8 -*-
"""
Created on Mon Jul 19 17:14:06 2021
@author: Neguine
"""
'''import packages'''
import statistics as s
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import collections
import os
import xlsxwriter
import xlrd

def save_batch_file(batch_file_commands, batchfilename):
    """Create (or overwrite) ``batchfilename`` with the given command text.

    Args:
        batch_file_commands: Full text of the batch file, written verbatim.
        batchfilename: Path of the batch file to write.
    """
    with open(batchfilename, mode='w') as script:
        script.write(batch_file_commands)
             
def read_in_txtdata(inputfile):
    """Read a transcript file and return its usable lines.

    A line is dropped when it is blank, when it starts with ``'('``, or when
    it consists solely of ``'.'``.  Every occurrence of ``'Uhh '`` is removed
    from the lines that are kept.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        List of cleaned lines, without line terminators, in file order.
    """
    # Plain read mode: nothing is written, so write permission is not needed.
    with open(inputfile, 'r') as infile:
        raw_lines = infile.read().splitlines()

    kept = []
    for line in raw_lines:
        # Blank lines carry no text; skipping them also avoids indexing line[0]
        # on an empty string.
        if not line:
            continue
        if line[0] != '(' and line != '.':
            kept.append(line.replace('Uhh ', ''))
    return kept
 
def save_to_temp_file(textlist, temp_filename):
    """Write each string of ``textlist`` on its own line of ``temp_filename``.

    Args:
        textlist: Strings to write; a newline is appended to each one.
        temp_filename: Path of the file to create or overwrite.
    """
    with open(temp_filename, 'w') as sink:
        sink.writelines(entry + '\n' for entry in textlist)

def parsed_txt(textlist,batchfilename,temp_filename):
    """Run the batch file on ``temp_filename`` and return its output lines.

    Args:
        textlist: Not used by this function; kept so existing calls still work.
        batchfilename: Command (batch file) to execute.
        temp_filename: Argument passed to the command, separated by one space.

    Returns:
        List of the lines the command wrote to standard output, each with its
        line terminator.
    """
    command = batchfilename + ' ' + temp_filename
    # The context manager closes the pipe (and waits for the child process)
    # once all output has been read, instead of leaving the handle open.
    with os.popen(command) as parser_out:
        return list(parser_out)

def _split_tagged_token(token):
    """Split ``word/TAG`` on its last slash; a token without a slash gets an empty tag."""
    if '/' not in token:
        return token, ''
    word, tag = token.rsplit('/', 1)
    return word, tag


def _record_cu(code, ambig, unambig, self_ref=None):
    """Append ``code`` to each bucket whose numeric range contains it.

    ``self_ref`` is only filled (for code 65) when a list is supplied.
    """
    if code < 43 or 55 < code < 64:
        unambig.append(code)
    if 42 < code < 56 or code == 64:
        ambig.append(code)
    if self_ref is not None and code == 65:
        self_ref.append(code)


def get_sentences(Txt,i):
    """Annotate parsed sentences with CU numbers and collect the codes found.

    ``Txt`` is the list of parser output lines.  Lines that start with a
    newline act as separators, and every six separators are treated as one
    sentence.  Within each group, the line right after the second separator is
    read as the ``word/TAG`` line.

    Relies on the module-level ``D`` (``word/TAG`` -> CU number) and
    ``lemmatizer`` objects.

    For each token:
      * a verb (tag starting with ``VB``) is lower-cased, lemmatized and looked
        up as ``lemma/VB``;
      * any other token is looked up as-is (after ``/NNP`` has been replaced
        by ``/NN`` on the whole line).
    A token that is found is rendered as ``word [code]``; otherwise just
    ``word``.

    Args:
        Txt: Parser output lines.
        i: Not used by this function; kept so existing calls still work.

    Returns:
        Tuple ``(S, num, A, U, I, Utt)``:
          * ``S``: annotated sentences, each word followed by one space;
          * ``num``: number of sentences;
          * ``A``: codes in (42, 56) or equal to 64 (the script's 'Ambig');
          * ``U``: codes below 43 or in (55, 64) (the script's 'Unambig');
          * ``I``: codes equal to 65, non-verb tokens only ('SelfRef');
          * ``Utt``: per sentence, the count of tokens whose first character
            is not one of ``. ? ! ,``.
    """
    separators = [idx for idx, line in enumerate(Txt) if line.startswith('\n')]
    num = len(separators) // 6  # num is the number of sentences for each subject

    S, A, U, I, Utt = [], [], [], [], []
    for k in range(num):
        tagged_line = Txt[separators[k * 6 + 1] + 1].replace('/NNP', '/NN')
        # Whitespace split: drops the trailing newline from the last token and
        # never yields empty tokens.
        tokens = tagged_line.split()
        Utt.append(sum(1 for token in tokens if token[0] not in '.?!,'))

        rendered = []
        for token in tokens:
            word, tag = _split_tagged_token(token)
            if tag[:2] == 'VB':
                verb_key = lemmatizer.lemmatize(word.lower(), pos='v') + '/VB'
                if verb_key in D:
                    # Recorded once, for the verb itself only; verbs are never
                    # added to the self-reference list.
                    _record_cu(D[verb_key], A, U)
                    rendered.append(word + ' [' + str(D[verb_key]) + ']')
                else:
                    rendered.append(word)
            elif token in D:
                _record_cu(D[token], A, U, I)
                rendered.append(word + ' [' + str(D[token]) + ']')
            else:
                rendered.append(word)
        S.append(''.join(piece + ' ' for piece in rendered))
    return S, num, A, U, I, Utt

# ---- File names and external-tool configuration ----
cur_dir = os.getcwd()
outfilename = 'C:\\Users\\negui\\Desktop\\R projects\\Jerry\\Results\\result1.xlsx'
data_dir = 'C:\\Users\\negui\\Desktop\\DATA\\picnic_controls_spoken'
contents = os.listdir(data_dir)
temp_filename = 'tempi.txt'
batchfilename = 'lexparser3.bat'
batch_file_commands = 'java -mx800m -cp "*;" edu.stanford.nlp.parser.lexparser.LexicalizedParser -outputFormat "words,typedDependenciesCollapsed,wordsAndTags," -printPCFGkBest 1 edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz %1'

# ---- Shared objects used by get_sentences ----
lemmatizer = WordNetLemmatizer()

# Read in the dictionary: column 1 holds the CU number, column 2 a
# ', '-separated list of entries that map to that number.
# NOTE(review): xlrd 2.x no longer reads .xlsx files; this call presumably
# relies on an older xlrd being installed -- confirm.
xl = 'C:\\Users\\negui\\Desktop\\R projects\\Sylvia\\CU_list.xlsx'
wb = xlrd.open_workbook(xl)
sheet = wb.sheet_by_index(0)
CuN = sheet.col_values(1)
Cu = sheet.col_values(2)

D = {}
for cu_number, cu_entries in zip(CuN, Cu):
    for entry in cu_entries.split(', '):
        # Skip empty entries (e.g. from an empty cell); they cannot be
        # capitalised below and would never match a token.
        if entry:
            D[entry] = cu_number

# Also accept every entry with its first letter upper-cased.
D1 = {entry[0].upper() + entry[1:]: cu_number for entry, cu_number in D.items()}
D.update(D1)

# ---- Run the pipeline ----
save_batch_file(batch_file_commands, batchfilename)

# The context manager closes (and thereby saves) the workbook even if
# processing a subject fails part-way.
with xlsxwriter.Workbook(outfilename) as workbook:
    worksheet = workbook.add_worksheet()
    # Column 1 is not written: the lookup that used to fill it is disabled.
    for column, title in (
        (0, 'SubjectID'),
        (2, 'Ambig'),
        (3, 'Unambig'),
        (4, 'SelfRef'),
        (5, 'SentenceCount'),
        (6, 'Utterance'),
    ):
        worksheet.write_string(0, column, title)

    # NOTE: only the second directory entry is processed; use
    # range(len(contents)) to process every file in data_dir.
    for i in range(1, 2):
        print('subject ' + str(i + 1), contents[i])
        cleanTxt = read_in_txtdata(os.path.join(data_dir, contents[i]))
        save_to_temp_file(cleanTxt, temp_filename)
        parsedTxt = parsed_txt(cleanTxt, batchfilename, temp_filename)
        S, num, A, U, I, Utt = get_sentences(parsedTxt, i)

        # Annotated sentences go into the working directory, one per line.
        name = os.path.join(cur_dir, contents[i] + '.txt')
        with open(name, 'w') as output:
            for row in S:
                output.write(row + '\n')

        # One spreadsheet row per subject; the code columns hold the number
        # of distinct CU numbers seen.
        row_index = i + 1
        worksheet.write_string(row_index, 0, contents[i])
        worksheet.write_number(row_index, 2, len(set(A)))
        worksheet.write_number(row_index, 3, len(set(U)))
        worksheet.write_number(row_index, 4, len(set(I)))
        worksheet.write_number(row_index, 5, num)
        worksheet.write_number(row_index, 6, sum(Utt))